# 2022.3.23
# from 2018 Zhang Cell paper 
library(Seurat)
library(tidyverse)
library(tidyseurat)

# Read the raw count table (one `symbol` column plus one column per cell).
cnts <- data.table::fread("CRC-I/data/GSE108989_CRC.TCell.S11138.count.txt.gz")

# Peek at the top-left corner of the table.
cnts[1:5, 1:5]

# Keep the gene symbol column and the cells whose column names contain the
# code of one of our genotyped patients. A Seurat object needs features as
# rows and cells as columns, and feature names must be unique, so only the
# first row of every symbol is retained.
data <- cnts %>%
  select(matches("symbol|0215|0701|0909|1012")) %>%
  distinct(symbol, .keep_all = TRUE)

# Drop rows without a symbol, move the symbols into the row names and build
# the Seurat object from the resulting feature-by-cell table.
sobj <- data |>
  filter(!is.na(symbol)) |>
  column_to_rownames("symbol") |>
  CreateSeuratObject(
    names.delim = "-",
    min.cells = 3,
    min.features = 200
  )

# Print a summary of the object.
sobj

# Read the per-cell annotation table.
meta <- read_delim("CRC-I/data/crc2018.meta.txt")

# Restrict the annotation to the four genotyped patients, then add:
#   * genotype - I232T genotype assigned per patient
#   * tissue   - label derived from the first letter of sampleType
# Rows matching none of the case_when() conditions get NA.
meta <- meta |>
  filter(Patient_ID %in% c("P0215", "P0701", "P0909", "P1012")) |>
  mutate(
    genotype = case_when(
      str_detect(Patient_ID, "P0701|P1012") ~ "II",
      Patient_ID == "P0909" ~ "IT",
      Patient_ID == "P0215" ~ "TT"
    ),
    tissue = case_when(
      str_starts(sampleType, "T") ~ "Tumor",
      str_starts(sampleType, "P") ~ "Blood",
      str_starts(sampleType, "N") ~ "Normal"
    )
  )

# Attach the per-cell annotation to the Seurat object.
# The metadata ID column (referenced here with a trailing space in its name)
# is renamed to `.cell` so it lines up with the cell column used for the join.
# The join key is spelled out so that no other shared column name can
# silently become part of the key.
sobj <- meta |>
  rename(.cell = `UniqueCell_ID `) |>
  left_join(x = sobj, y = _, by = ".cell")

# NOTE(review): tidyseurat is understood to return a plain tibble (with only a
# message) when a join duplicates cells, e.g. if the metadata lists a cell ID
# more than once - confirm against the tidyseurat docs. Fail loudly here
# rather than saving something that is not a Seurat object.
stopifnot(inherits(sobj, "Seurat"))

# Save the genotyped, annotated object.
write_rds(sobj, "CRC-I/data/zhang2018_Count.rds")

# Export the cluster, tissue, genotype and cell ID columns of the filtered
# annotation table as CSV.
write_csv(
  select(
    meta,
    all_of(c("majorCluster", "tissue", "genotype", "UniqueCell_ID "))
  ),
  "CRC-I/data/zhang2018_meta.csv"
)

